{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f426f65-7904-4231-92a5-e899547d90ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install minsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "63b0fd25-41a2-48ad-b9bf-3f1265308bd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import minsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9b37cce2-ed84-408b-9106-d61360aa82c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5bdaf6ce-2540-494f-989c-5b94b1b6626c",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('documents.json', 'rt') as f_in:\n",
    "    docs_raw = json.load(f_in)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "42b9b1f9-3c90-42b0-beb4-cb419f9cdcea",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = []\n",
    "\n",
    "for course_dict in docs_raw:\n",
    "    for doc in course_dict['documents']:\n",
    "        doc['course'] = course_dict['course']\n",
    "        documents.append(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "57de60e5-b96c-499c-a7cf-0f30fc33b324",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"The purpose of this document is to capture frequently asked technical questions\\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\\nSubscribe to course public Google Calendar (it works from Desktop only).\\nRegister before the course starts using this link.\\nJoin the course Telegram channel with announcements.\\nDon’t forget to register in DataTalks.Club's Slack and join the channel.\",\n",
       " 'section': 'General course-related questions',\n",
       " 'question': 'Course - When will the course start?',\n",
       " 'course': 'data-engineering-zoomcamp'}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c499838b-73b3-44be-8ba6-f46d3693aa59",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = minsearch.Index(\n",
    "    text_fields=[\"question\", \"text\", \"section\"],\n",
    "    keyword_fields=[\"course\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e201f0f3-b380-43a5-95ee-62b1b79002d1",
   "metadata": {},
   "source": [
    "SELECT * WHERE course = 'data-engineering-zoomcamp';"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a8d8ea88-7412-49c1-8a8e-44d0d0862a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "q = 'the course has already started, can I still enroll?'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ce7d0d18-5c07-4010-9f90-bbd021f110c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<minsearch.Index at 0x7d0d016b8760>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.fit(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "aa755a08-b98d-4e92-8994-04e6108499d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ef8e9cdc-dfd4-4e54-a332-4b9bde4e6047",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e7da9664-ecb3-4d89-87da-9b2b942444d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"It's not uncommon for courses to accept enrollments even after they have started, but policies can vary widely depending on the institution or provider offering the course. Here are a few steps you can take to find out if you can still enroll:\\n\\n1. **Check the Course Platform**: If the course is offered online, visit the course's webpage for information about late enrollment policies.\\n\\n2. **Contact the Instructor**: Reach out to the course instructor or lead facilitator. They may be willing to make an exception or provide you with the necessary information.\\n\\n3. **Reach Out to Administrative Offices**: Contact the academic or administrative office responsible for course enrollments. This might be the registrar's office, student services, or a similar department.\\n\\n4. **Review Deadlines and Policies**: Look for any publicly available documentation outlining the deadlines and policies regarding late enrollments.\\n\\n5. **Consider Catching Up**: Be prepared to quickly catch up on any missed material if you are allowed to enroll late. This shows commitment and can make it easier for instructors or administration to accommodate your request.\\n\\nRemember, clear and courteous communication will always help you in these situations. Good luck!\""
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response = client.chat.completions.create(\n",
    "    model='gpt-4o',\n",
    "    messages=[{\"role\": \"user\", \"content\": q}]\n",
    ")\n",
    "\n",
    "response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b21237c3-80e9-429c-a089-d45428087046",
   "metadata": {},
   "outputs": [],
   "source": [
    "def search(query):\n",
    "    boost = {'question': 3.0, 'section': 0.5}\n",
    "\n",
    "    results = index.search(\n",
    "        query=query,\n",
    "        filter_dict={'course': 'data-engineering-zoomcamp'},\n",
    "        boost_dict=boost,\n",
    "        num_results=5\n",
    "    )\n",
    "\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8cc5784e-6515-42e5-be62-8fb915df1088",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_prompt(query, search_results):\n",
    "    prompt_template = \"\"\"\n",
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
    "Use only the facts from the CONTEXT when answering the QUESTION.\n",
    "\n",
    "QUESTION: {question}\n",
    "\n",
    "CONTEXT: \n",
    "{context}\n",
    "\"\"\".strip()\n",
    "\n",
    "    context = \"\"\n",
    "    \n",
    "    for doc in search_results:\n",
    "        context = context + f\"section: {doc['section']}\\nquestion: {doc['question']}\\nanswer: {doc['text']}\\n\\n\"\n",
    "    \n",
    "    prompt = prompt_template.format(question=query, context=context).strip()\n",
    "    return prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "97d35dec-c25f-472d-b961-20d5c30902ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "def llm(prompt):\n",
    "    response = client.chat.completions.create(\n",
    "        model='gpt-4o',\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}]\n",
    "    )\n",
    "    \n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8602f40b-ad3b-49c9-b3cc-051a79c888bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = 'how do I run kafka?'\n",
    "\n",
    "def rag(query):\n",
    "    search_results = search(query)\n",
    "    prompt = build_prompt(query, search_results)\n",
    "    answer = llm(prompt)\n",
    "    return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5fd4497b-c5d5-4258-b950-6b35d1af4ec5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"To run Kafka, follow the relevant instructions based on your use case:\\n\\n### For Java:\\nNavigate to your project directory and use the following command in the terminal to run a Kafka producer/consumer/KStreams, etc.:\\n```shell\\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\\n```\\nReplace `<jar_name>` with the actual name of your JAR file.\\n\\n### For Python:\\nIf you're running Python Kafka, ensure you have your virtual environment set up and activate it. Here’s how you can do it:\\n\\n1. **Create a virtual environment and install required packages (run only once):**\\n    ```shell\\n    python -m venv env\\n    source env/bin/activate\\n    pip install -r ../requirements.txt\\n    ```\\n\\n2. **Activate the virtual environment (run this every time you need it):**\\n    ```shell\\n    source env/bin/activate\\n    ```\\n\\n3. **Deactivate the virtual environment when done:**\\n    ```shell\\n    deactivate\\n    ```\\n\\nNote: On Windows, the activation command would be slightly different:\\n```shell\\nenv\\\\Scripts\\\\activate\\n```\\n\\nMake sure that your Docker images are up and running if they are part of your setup.\""
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "385b012f-4905-422d-8d7c-3d542dfe5a7c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Yes, you can still enroll in the course even after it has started. You are eligible to submit homework assignments, but please be mindful of the deadlines for the final projects to ensure you complete everything on time.'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag('the course has already started, can I still enroll?')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d3e04fb3-b7f7-4e53-8de9-a1c6cde3f038",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"The purpose of this document is to capture frequently asked technical questions\\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\\nSubscribe to course public Google Calendar (it works from Desktop only).\\nRegister before the course starts using this link.\\nJoin the course Telegram channel with announcements.\\nDon’t forget to register in DataTalks.Club's Slack and join the channel.\",\n",
       " 'section': 'General course-related questions',\n",
       " 'question': 'Course - When will the course start?',\n",
       " 'course': 'data-engineering-zoomcamp'}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2c05052f-a85a-4137-8398-0fd0be678599",
   "metadata": {},
   "outputs": [],
   "source": [
    "from elasticsearch import Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a78df1cc-5a5a-40b4-b673-19c7f0319453",
   "metadata": {},
   "outputs": [],
   "source": [
    "es_client = Elasticsearch('http://localhost:9200') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c9367c18-41ad-495e-9920-1a0c552f0d18",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index_settings = {\n",
    "    \"settings\": {\n",
    "        \"number_of_shards\": 1,\n",
    "        \"number_of_replicas\": 0\n",
    "    },\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"section\": {\"type\": \"text\"},\n",
    "            \"question\": {\"type\": \"text\"},\n",
    "            \"course\": {\"type\": \"keyword\"} \n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "index_name = \"course-questions\"\n",
    "\n",
    "es_client.indices.create(index=index_name, body=index_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "9f778c93-a5b6-4634-b42e-0c25083a2512",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"The purpose of this document is to capture frequently asked technical questions\\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\\nSubscribe to course public Google Calendar (it works from Desktop only).\\nRegister before the course starts using this link.\\nJoin the course Telegram channel with announcements.\\nDon’t forget to register in DataTalks.Club's Slack and join the channel.\",\n",
       " 'section': 'General course-related questions',\n",
       " 'question': 'Course - When will the course start?',\n",
       " 'course': 'data-engineering-zoomcamp'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "5c230059-e219-4a13-a7f8-ede4cf1b028f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/python/3.10.13/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "70fe3c97-916d-42c0-bd7b-4f42d9056409",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████| 948/948 [00:28<00:00, 33.07it/s]\n"
     ]
    }
   ],
   "source": [
    "for doc in tqdm(documents):\n",
    "    es_client.index(index=index_name, document=doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "f1bc1244-b8dc-4228-8171-c0507004db93",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = 'I just disovered the course. Can I still join it?'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "7c72e000-910b-4fb5-aa88-2561e7bc39f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def elastic_search(query):\n",
    "    search_query = {\n",
    "        \"size\": 5,\n",
    "        \"query\": {\n",
    "            \"bool\": {\n",
    "                \"must\": {\n",
    "                    \"multi_match\": {\n",
    "                        \"query\": query,\n",
    "                        \"fields\": [\"question^3\", \"text\", \"section\"],\n",
    "                        \"type\": \"best_fields\"\n",
    "                    }\n",
    "                },\n",
    "                \"filter\": {\n",
    "                    \"term\": {\n",
    "                        \"course\": \"data-engineering-zoomcamp\"\n",
    "                    }\n",
    "                }\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\n",
    "    response = es_client.search(index=index_name, body=search_query)\n",
    "    \n",
    "    result_docs = []\n",
    "    \n",
    "    for hit in response['hits']['hits']:\n",
    "        result_docs.append(hit['_source'])\n",
    "    \n",
    "    return result_docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "81abecbc-eb6b-428f-ab7d-7e21f58b64de",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rag(query):\n",
    "    search_results = elastic_search(query)\n",
    "    prompt = build_prompt(query, search_results)\n",
    "    answer = llm(prompt)\n",
    "    return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "0ea9315a-a619-4066-9e90-8c260f2c8450",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homeworks, but be mindful of the deadlines for turning in the final projects. So make sure not to leave everything for the last minute.'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8095274-c9cd-4fd5-80d2-069fc951834f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
